Codigo y salidas importantes

Corrida de blastp

blasteo <- function(filename, pathPan){
  query <- paste('-query', paste(pathPan, filename, sep = '/'))
  subject <- paste('-subject', paste(pathPan, filename, sep = '/'))
  cleanFileName <- gsub(".faa", '', filename)
  fileOut <- paste(cleanFileName, '-blastp.out', sep= '')
  out <- paste('-out', fileOut)
  blastp <- paste('blastp', query, subject, '-num_threads 5 -outfmt 7 -max_hsps 1 -use_sw_tback', out)
  system(blastp)
  return(0)
}

blasteo('ABC.faa', '/export/storage/users/aescobed/Bioinfo_LCG/Modulo-4/Proyecto')

Lectura de salida de blastp

#Solo se mantienen aquellas lineas que no sean comentarios
quitComments <- function(line){
    if (startsWith(x = line, prefix= '#') == FALSE) {
        return(line)}
    else {
        return(NA)}
}

#Convertir a un vector una linea separada por tabs 
splitLines <- function(line) {
    splitedLine <- as.vector(str_split(line, pattern= '\t', simplify =TRUE))
    return (splitedLine)
}

#Convertir un archivo output de blastp (tabular con comentarios) a un data frame sin comentarios
fileToDataFrame <- function(filename){
    fileContent <- readLines(filename)
    cleanFileContent <- as.vector(sapply(fileContent, quitComments))
    cleanestFileContent <- cleanFileContent[!is.na(cleanFileContent)]
    fragmentedFileContent <- sapply(cleanestFileContent, splitLines)
    preDataframeFileContent <- as.data.frame(fragmentedFileContent)
    if (dim(preDataframeFileContent)[1] > 0){
        names(preDataframeFileContent) <- NULL
        dataFrameFileContent <- as.data.frame(t(preDataframeFileContent))
        names(dataFrameFileContent) <- c('query_acc.ver', 'subject_acc.ver', '%_identity', 'alignment_length', 'mismatches', 'gap_opens', 'q._start', 'q._end', 's._start', 's. end', 'evalue', 'bit_score')
        return(dataFrameFileContent)}
    else {
        return(data.frame())}
}

data <- fileToDataFrame('ABC-blastp.out')
data

Matriz de distancias (con bitscores)

disMatrix <- matrix(rep(0,10000), nrow= 100, ncol=100)
protsNames <- unique(data$query_acc.ver)
colnames(disMatrix) <- protsNames
row.names(disMatrix) <- protsNames
for (protA in protsNames) {
    for (protB in protsNames) {
        if (protA != protB){
            filterData <- filter(data, query_acc.ver== protA, subject_acc.ver == protB)
            if (dim(filterData)[1] != 0){
            bitScore <- filterData$bit_score[1]
            disMatrix[protA, protB] <- as.numeric(bitScore)}
            else{
                disMatrix[protA, protB] <- as.numeric(0)}
            }
        else{
            disMatrix[protA, protB] <- as.numeric(1)}
        }
}

disMatrix[1:10, 1:10]
            ABC1-Q03024 ABC1-C1DS84 ABC1-Q54456 ABC1-Q9RHT2 ABC1-Q9ZG94
ABC1-Q03024           1         212         194         190         183
ABC1-C1DS84         199           1         199         200         191
ABC1-Q54456         192         212           1         193         189
ABC1-Q9RHT2         189         212         194           1         189
ABC1-Q9ZG94         187         206         193         191           1
ABC1-O67184         182         202         176         166         158
ABC1-Q9Z3C5         179         201         185         155         165
ABC1-O05693         173         184         174         161         173
ABC1-P23596         169         197         184         185         172
ABC1-O85350         167         189         173         162         160
            ABC1-O67184 ABC1-Q9Z3C5 ABC1-O05693 ABC1-P23596 ABC1-O85350
ABC1-Q03024         198         181         175         168         185
ABC1-C1DS84         202         187         170         184         189
ABC1-Q54456         192         185         177         181         191
ABC1-Q9RHT2         182         157         166         184         183
ABC1-Q9ZG94         172         167         175         171         177
ABC1-O67184           1         183         162         167         180
ABC1-Q9Z3C5         200           1         170         152         188
ABC1-O05693         172         172           1         160         178
ABC1-P23596         184         155         162           1         162
ABC1-O85350         180         171         168         142           1

Normalizacion de matriz

normDisMatrix <-matrix(rep(0,10000), nrow= 100, ncol=100)
colnames(normDisMatrix) <- protsNames
row.names(normDisMatrix) <- protsNames
maxBitScore <-  max(apply(disMatrix, 1, max))
for (protA in protsNames) {
    for (protB in protsNames) {
        if (protA != protB){
            norm <- disMatrix[protA, protB]/maxBitScore
            dis <- 1- norm
            normDisMatrix[protA, protB] <- dis
            }
        }
}

normDisMatrix[1:10, 1:10]
            ABC1-Q03024 ABC1-C1DS84 ABC1-Q54456 ABC1-Q9RHT2 ABC1-Q9ZG94
ABC1-Q03024   0.0000000   0.2262774   0.2919708   0.3065693   0.3321168
ABC1-C1DS84   0.2737226   0.0000000   0.2737226   0.2700730   0.3029197
ABC1-Q54456   0.2992701   0.2262774   0.0000000   0.2956204   0.3102190
ABC1-Q9RHT2   0.3102190   0.2262774   0.2919708   0.0000000   0.3102190
ABC1-Q9ZG94   0.3175182   0.2481752   0.2956204   0.3029197   0.0000000
ABC1-O67184   0.3357664   0.2627737   0.3576642   0.3941606   0.4233577
ABC1-Q9Z3C5   0.3467153   0.2664234   0.3248175   0.4343066   0.3978102
ABC1-O05693   0.3686131   0.3284672   0.3649635   0.4124088   0.3686131
ABC1-P23596   0.3832117   0.2810219   0.3284672   0.3248175   0.3722628
ABC1-O85350   0.3905109   0.3102190   0.3686131   0.4087591   0.4160584
            ABC1-O67184 ABC1-Q9Z3C5 ABC1-O05693 ABC1-P23596 ABC1-O85350
ABC1-Q03024   0.2773723   0.3394161   0.3613139   0.3868613   0.3248175
ABC1-C1DS84   0.2627737   0.3175182   0.3795620   0.3284672   0.3102190
ABC1-Q54456   0.2992701   0.3248175   0.3540146   0.3394161   0.3029197
ABC1-Q9RHT2   0.3357664   0.4270073   0.3941606   0.3284672   0.3321168
ABC1-Q9ZG94   0.3722628   0.3905109   0.3613139   0.3759124   0.3540146
ABC1-O67184   0.0000000   0.3321168   0.4087591   0.3905109   0.3430657
ABC1-Q9Z3C5   0.2700730   0.0000000   0.3795620   0.4452555   0.3138686
ABC1-O05693   0.3722628   0.3722628   0.0000000   0.4160584   0.3503650
ABC1-P23596   0.3284672   0.4343066   0.4087591   0.0000000   0.4087591
ABC1-O85350   0.3430657   0.3759124   0.3868613   0.4817518   0.0000000

Ejecucion y evaluacion de metodos de clustering

Obtencion de numero de clusters tentativos con el metodo silhouette

#Para clustering single 
fviz_nbclust(normDisMatrix, FUN = hcut, hc_func = "hclust", hc_method = "single", method = "silhouette", k.max = 10) +
  labs(subtitle = "Silhouette method")

#Para clustering average 
fviz_nbclust(normDisMatrix, FUN = hcut, hc_func = "hclust", hc_method = "average", method = "silhouette", k.max = 10) +
  labs(subtitle = "Silhouette method")

#Para clustering complete 
fviz_nbclust(normDisMatrix, FUN = hcut, hc_func = "hclust", hc_method = "complete", method = "silhouette", k.max = 10) +
  labs(subtitle = "Silhouette method")

fviz_nbclust(normDisMatrix, FUN = hcut, hc_func = "hclust", hc_method = "ward.D2", method = "silhouette", k.max = 10) +
  labs(subtitle = "Silhouette method")

Creacion de agrupamientos y arboles

csin <- hclust(dist(normDisMatrix, method= 'euclidean'), method = "single")
singleTree <- as.phylo(csin)
write.tree(phy=singleTree, file="single.tree")

cave <- hclust(dist(normDisMatrix, method= 'euclidean'), method = "average")
averageTree <- as.phylo(cave)
write.tree(phy= averageTree, file= "average.tree")

ccom <- hclust(dist(normDisMatrix, method= 'euclidean'), method = "complete")
completeTree <- as.phylo(ccom)
write.tree(phy= completeTree, file= "complete.tree")

cwar <- hclust(dist(normDisMatrix, method= 'euclidean'), method = "ward.D2")
wardTree <- as.phylo(cwar)
write.tree(phy= wardTree , file= "ward.tree")

Visualizacion de grupos

#Para clustering single 
clustersSingle <- cutree(csin, k=5)
fviz_cluster(list(data = normDisMatrix, cluster = clustersSingle))

#Para clustering average
clustersAverage <- cutree(cave, k=4)
fviz_cluster(list(data = normDisMatrix, cluster = clustersAverage))

#Para clustering complete 
clustersComplete <- cutree(ccom, k=4)
fviz_cluster(list(data = normDisMatrix, cluster = clustersAverage))

# Para clustering Ward
clustersWard <- cutree(cwar, k=4)
fviz_cluster(list(data = normDisMatrix, cluster = clustersAverage))

Visualizacion de arboles en R

plot (csin, hang = -1, main = "Single")
rect.hclust(csin, k=5,  border=1:16)

plot (cave, hang = -1, main = "Average")
rect.hclust(cave, k=4,  border=1:16)

plot (ccom, hang = -1, main = "Complete")
rect.hclust(ccom, k=5,  border=1:16)

plot (cwar, hang = -1, main = "Ward.D")
rect.hclust(cwar, k=4,  border=1:16)

Evaluacion con coef()

coefSingle <- coef(csin)
coefAverage <- coef(cave)
coefComplete <- coef(ccom)
coefWard <- coef(cwar)

dfCoefs <- data.frame(Metodo= c('Single', 'Average', 'Complete', 'Ward'), Coeficiente = c(coefSingle, coefAverage, coefComplete, coefWard))
dfCoefs